In [1]:
#For data manipulation
import pandas as pd

#For numerical manipulations
import numpy as np

#For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

#Machine learning algorithms 
import statsmodels.api as sm
from sklearn.model_selection import train_test_split  #For Building Train and Test Set
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

#To generate report of data
import pandas_profiling as pd_prof

import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation and data-conversion warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')
In [30]:
# Read the training data and preview its first five rows.
train = pd.read_csv(
    "C:\\Users\\Administrator\\Downloads\\Hackathon\\train.csv"
)
train.head()
Out[30]:
Severity Safety_Score Days_Since_Inspection Total_Safety_Complaints Control_Metric Turbulence_In_gforces Cabin_Temperature Accident_Type_Code Max_Elevation Violations Adverse_Weather_Metric Accident_ID
0 Minor_Damage_And_Injuries 49.223744 14 22 71.285324 0.272118 78.04 2 31335.476824 3 0.424352 7570
1 Minor_Damage_And_Injuries 62.465753 10 27 72.288058 0.423939 84.54 2 26024.711057 2 0.352350 12128
2 Significant_Damage_And_Fatalities 63.059361 13 16 66.362808 0.322604 78.86 7 39269.053927 3 0.003364 2181
3 Significant_Damage_And_Serious_Injuries 48.082192 11 9 74.703737 0.337029 81.79 3 42771.499200 1 0.211728 5946
4 Significant_Damage_And_Fatalities 26.484018 13 25 47.948952 0.541140 77.16 3 35509.228515 2 0.176883 9054
In [3]:
# (rows, columns) of the training frame.
train.shape
Out[3]:
(10000, 12)
In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train.describe()
Out[4]:
Safety_Score Days_Since_Inspection Total_Safety_Complaints Control_Metric Turbulence_In_gforces Cabin_Temperature Accident_Type_Code Max_Elevation Violations Adverse_Weather_Metric Accident_ID
count 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.00000 10000.000000 10000.000000
mean 41.876406 12.931100 6.564300 65.145324 0.381495 79.969331 3.814900 32001.803282 2.01220 0.255635 6266.554200
std 16.138072 3.539803 6.971982 11.882934 0.121301 2.759739 1.902577 9431.995196 1.03998 0.381128 3610.170288
min 0.000000 1.000000 0.000000 0.000000 0.134000 74.740000 1.000000 831.695553 0.00000 0.000316 2.000000
25% 30.593607 11.000000 2.000000 56.927985 0.293665 77.960000 2.000000 25757.636908 1.00000 0.012063 3139.750000
50% 41.278539 13.000000 4.000000 65.587967 0.365879 79.540000 4.000000 32060.336419 2.00000 0.074467 6280.500000
75% 52.511416 15.000000 9.000000 73.336372 0.451346 81.560000 5.000000 38380.641513 3.00000 0.354059 9391.500000
max 100.000000 23.000000 54.000000 100.000000 0.882648 97.510000 7.000000 64297.651218 5.00000 2.365378 12500.000000
In [5]:
# Same module that the first cell already imports; re-imported here.
import pandas_profiling as pd_prof

# Build an automated profiling report of the training frame.
pd_prof.ProfileReport(train)
Out[5]:

In [8]:
# Number of rows per Severity label.
train['Severity'].value_counts()
Out[8]:
Highly_Fatal_And_Damaging                  3049
Significant_Damage_And_Serious_Injuries    2729
Minor_Damage_And_Injuries                  2527
Significant_Damage_And_Fatalities          1695
Name: Severity, dtype: int64
In [10]:
# Encode the four Severity labels as integer codes, running from
# Minor_Damage_And_Injuries (1) up to Highly_Fatal_And_Damaging (4).
severity_codes = {
    'Minor_Damage_And_Injuries': 1,
    'Significant_Damage_And_Serious_Injuries': 2,
    'Significant_Damage_And_Fatalities': 3,
    'Highly_Fatal_And_Damaging': 4,
}
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on train['Severity']: that form mutates an
# intermediate Series and does not update `train` once pandas Copy-on-Write
# is active. Values missing from the mapping (including codes left by an
# earlier run of this cell) pass through unchanged, so the cell is safe to
# re-run.
train['Severity'] = train['Severity'].map(
    lambda label: severity_codes.get(label, label)
)
In [11]:
# Number of rows per Severity value after the integer encoding.
train['Severity'].value_counts()
Out[11]:
4    3049
2    2729
1    2527
3    1695
Name: Severity, dtype: int64
In [12]:
# Summary statistics again; Severity is numeric now, so it appears in the table.
train.describe()
Out[12]:
Severity Safety_Score Days_Since_Inspection Total_Safety_Complaints Control_Metric Turbulence_In_gforces Cabin_Temperature Accident_Type_Code Max_Elevation Violations Adverse_Weather_Metric Accident_ID
count 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.00000 10000.000000 10000.000000
mean 2.526600 41.876406 12.931100 6.564300 65.145324 0.381495 79.969331 3.814900 32001.803282 2.01220 0.255635 6266.554200
std 1.168173 16.138072 3.539803 6.971982 11.882934 0.121301 2.759739 1.902577 9431.995196 1.03998 0.381128 3610.170288
min 1.000000 0.000000 1.000000 0.000000 0.000000 0.134000 74.740000 1.000000 831.695553 0.00000 0.000316 2.000000
25% 1.000000 30.593607 11.000000 2.000000 56.927985 0.293665 77.960000 2.000000 25757.636908 1.00000 0.012063 3139.750000
50% 2.000000 41.278539 13.000000 4.000000 65.587967 0.365879 79.540000 4.000000 32060.336419 2.00000 0.074467 6280.500000
75% 4.000000 52.511416 15.000000 9.000000 73.336372 0.451346 81.560000 5.000000 38380.641513 3.00000 0.354059 9391.500000
max 4.000000 100.000000 23.000000 54.000000 100.000000 0.882648 97.510000 7.000000 64297.651218 5.00000 2.365378 12500.000000
In [13]:
# Profiling report of the training frame after Severity has been encoded.
pd_prof.ProfileReport(train)
Out[13]:

In [14]:
# Box plot of all numeric columns drawn on a single shared axis.
# NOTE(review): the columns have very different ranges (see describe() above),
# so the large-valued ones probably dominate the plot -- confirm it is readable.
train.boxplot()
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x22a35eda630>
In [16]:
# Pairwise correlation matrix (pandas default method), keeping only the
# column that relates each feature to the encoded Severity.
# NOTE(review): Severity holds class codes, so a linear correlation may
# understate how informative a feature is -- treat as a rough screen only.
train.corr()['Severity']
Out[16]:
Severity                   1.000000
Safety_Score              -0.308136
Days_Since_Inspection     -0.033429
Total_Safety_Complaints   -0.026801
Control_Metric             0.013701
Turbulence_In_gforces     -0.031615
Cabin_Temperature          0.019172
Accident_Type_Code         0.018977
Max_Elevation             -0.019331
Violations                 0.004590
Adverse_Weather_Metric     0.029530
Accident_ID                0.001033
Name: Severity, dtype: float64
In [74]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split  #For Building Train and Test Set
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
In [72]:
from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc
from sklearn.model_selection import GridSearchCV,StratifiedKFold
In [68]:
# Feature matrix: every column except the target.
# NOTE(review): Accident_ID stays in the features; it looks like a row
# identifier -- confirm that using it as a predictor is intended.
x = train.drop(columns=['Severity'])
# Target, kept as a one-column DataFrame.
y = train.loc[:, ['Severity']]
In [69]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.3, random_state=42
)

Creating a model using RandomForestClassifier

In [120]:
# Sweep the forest size from 20 to 99 trees and print every setting whose
# weighted F1 (in percent) on the held-out split exceeds the threshold.
# The best setting is also stored in best_score / best_n_estimators, so it
# is still available when no setting clears the threshold.
# NOTE(review): the sweep is scored on xtest/ytest, so the winning score is
# an optimistic estimate; cross-validation on xtrain would be a fairer selector.
f1_report_threshold = 93
best_score, best_n_estimators = float('-inf'), None
for i in range(20,100):
    rfc = RandomForestClassifier(n_estimators=i,random_state=47,bootstrap=False)
    # ytrain is a one-column frame; ravel it to the 1-D target that fit() expects.
    rfc.fit(xtrain,np.ravel(ytrain))
    pred = rfc.predict(xtest)
    score = 100*(f1_score(ytest,pred,average='weighted'))
    if score>best_score:
        best_score, best_n_estimators = score, i
    if score>f1_report_threshold:
        print(score,i)
93.02609543446648 45
In [93]:
# Read the unlabeled test data and preview its first five rows.
test = pd.read_csv(
    "C:\\Users\\Administrator\\Downloads\\Hackathon\\test.csv"
)
test.head(5)
Out[93]:
Safety_Score Days_Since_Inspection Total_Safety_Complaints Control_Metric Turbulence_In_gforces Cabin_Temperature Accident_Type_Code Max_Elevation Violations Adverse_Weather_Metric Accident_ID
0 19.497717 16 6 72.151322 0.388959 78.32 4 37949.724386 2 0.069692 1
1 58.173516 15 3 64.585232 0.250841 78.60 7 30194.805567 2 0.002777 10
2 33.287671 15 3 64.721969 0.336669 86.96 6 17572.925484 1 0.004316 14
3 3.287671 21 5 66.362808 0.421775 80.86 3 40209.186341 2 0.199990 17
4 10.867580 18 2 56.107566 0.313228 79.22 2 35495.525408 2 0.483696 21
In [94]:
# (rows, columns) of the test frame.
test.shape
Out[94]:
(2500, 11)
In [ ]:
# Rebuild features and target from the complete training frame.
x = train.drop(columns=['Severity'])
# Target, kept as a one-column DataFrame.
y = train.loc[:, ['Severity']]
In [100]:
# Final model: fit on the complete training frame and predict the test rows.
# The model built here (rfc1) is the one that is fitted and used; previously
# the leftover `rfc` from the n_estimators sweep was fitted instead, and rfc1
# was never trained.
# NOTE(review): n_estimators=98 differs from the size the sweep printed as
# best (45) -- confirm which setting is intended.
rfc1 = RandomForestClassifier(n_estimators=98,random_state=47)
# y is a one-column frame; ravel it to the 1-D target that fit() expects.
rfc1.fit(x,np.ravel(y))
pred = rfc1.predict(test)
In [114]:
# Submission frame: one predicted Severity per test Accident_ID, indexed by
# the accident identifier (column named 'Accident').
# NOTE(review): pred holds the integer codes the model was trained on, not
# the original Severity label strings -- confirm which form is expected.
output = pd.DataFrame(
    {'Accident': test['Accident_ID'], 'Severity': pred}
).set_index('Accident')
In [116]:
# Write the predictions to CSV; the 'Accident' index is written as the first column.
output.to_csv("C:\\Users\\Administrator\\Downloads\\Hackathon\\output1.csv")
In [ ]:
 
In [ ]: